
// Shared flag tested by the load/branch benchmarks in this file. When
// BRANCH_EXT_BENCHMARK is defined, thread_handler() also stores to it through
// a volatile pointer from a second thread.
volatile int enabled;

// Benchmark: executes the INSN_NOP asm statement once per timed iteration.
// INSN_NOP and benchmark_DoNotOptimize_Iterations() are defined outside this
// section (presumably a no-op mnemonic string and an optimisation barrier --
// confirm against their definitions).
void BM_nop(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        // No outputs, no inputs; the "memory" clobber stops the compiler from
        // moving memory accesses across the statement.
        asm volatile (INSN_NOP:::"memory");
    }
}
BENCHMARK(BM_nop);
// BENCHMARK(BM_nop)->ThreadPerCpu();

// Benchmark: one INSN_JMP from local label 1 to the label 2 that immediately
// follows it, per timed iteration. INSN_JMP is defined outside this section
// (presumably the target's unconditional-branch mnemonic -- confirm).
void BM_ub(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        // Keep whitespace between the string literals and the macro: since
        // C++11 a literal directly followed by an identifier ("1: "INSN_JMP)
        // is lexed as a user-defined-literal suffix instead of being
        // macro-expanded and concatenated.
        asm volatile (
            "1: " INSN_JMP " 2f \n"
            "2:"
            :::"memory");
    }
}
BENCHMARK(BM_ub);
// BENCHMARK(BM_ub)->ThreadPerCpu();

// Benchmark: test a volatile local that is always 1 against zero and run
// benchmark_ClobberMemory() when it is non-zero (i.e. on every iteration).
void BM_bnez(benchmark::State& state) {
    // `register` dropped: the storage class was deprecated in C++11 and
    // removed in C++17, and it contradicts `volatile` anyway.
    volatile int x = 1;

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        benchmark::DoNotOptimize(x);
        if (x != 0)
            benchmark_ClobberMemory();
    }
}
BENCHMARK(BM_bnez);
// BENCHMARK(BM_bnez)->ThreadPerCpu();

// Benchmark: test a volatile local that is always 0 against zero and run
// benchmark_ClobberMemory() when it is zero (i.e. on every iteration).
void BM_beqz(benchmark::State& state) {
    // `register` dropped: the storage class was deprecated in C++11 and
    // removed in C++17, and it contradicts `volatile` anyway.
    volatile int x = 0;

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        benchmark::DoNotOptimize(x);
        if (x == 0)
            benchmark_ClobberMemory();
    }
}
BENCHMARK(BM_beqz);
// BENCHMARK(BM_beqz)->ThreadPerCpu();


#ifdef BRANCH_EXT_BENCHMARK

// Benchmark: read the global volatile `enabled` (preset to 1) and take the
// "non-zero" branch on every timed iteration.
void BM_load_bnez(benchmark::State& state) {
    enabled = 1;

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        benchmark::DoNotOptimize(enabled);
        // Single volatile read, true while `enabled` stays non-zero.
        if (enabled)
            benchmark_ClobberMemory();
    }
}
BENCHMARK(BM_load_bnez);
// BENCHMARK(BM_load_bnez)->ThreadPerCpu();

// Benchmark: read the global volatile `enabled` (preset to 0) and take the
// "zero" branch on every timed iteration.
void BM_load_beqz(benchmark::State& state) {
    enabled = 0;

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        benchmark::DoNotOptimize(enabled);
        // Single volatile read, true while `enabled` stays zero.
        if (!enabled)
            benchmark_ClobberMemory();
    }
}
BENCHMARK(BM_load_beqz);
// BENCHMARK(BM_load_beqz)->ThreadPerCpu();

// Handshake flags between a benchmark and its helper thread:
// thread_handler() sets thread_start once it is running, and the benchmark
// sets thread_exit to make the helper leave its loop.
volatile int thread_start;
volatile int thread_exit;
// Arguments passed to thread_handler() through pthread_create().
struct th_data {
    int enabled;    // value the helper starts from (copied from `enabled`)
    int type;       // one of the mode constants in the enum below
};
volatile struct th_data tdata;

// Helper-thread modes; what each one stores to `enabled` is decided by the
// switch in thread_handler().
enum {
    CACHE_MISS = 0,         // keep storing the starting value
    CACHE_BRANCH_MISS = 1,  // alternate between the starting value and 1 - it
    BRANCH_MISS = 2,        // keep storing 1 - starting value
    NO_MISS = 3,            // no stores at all (falls into `default`)
};

// pthread entry point for the *_miss_* benchmarks.
//
// `data` points at a th_data block. After announcing itself through
// thread_start, the thread keeps storing to the global `enabled` according to
// th_data::type until thread_exit becomes non-zero. Always returns NULL.
static void *thread_handler (void *data)
{
    struct th_data *args = (struct th_data *)data;
    volatile int *flag = &enabled;
    long value = args->enabled;

    thread_start = 1;

    while (!thread_exit) {
        const int mode = args->type;

        if (mode == CACHE_MISS) {
            // Rewrite the same value over and over.
            *flag = value;
        } else if (mode == BRANCH_MISS) {
            // Rewrite the complement of the starting value.
            *flag = 1 - value;
        } else if (mode == CACHE_BRANCH_MISS) {
            // Flip the value on every pass.
            value = 1 - value;
            *flag = value;
        }
        // Any other mode (e.g. NO_MISS): spin without storing.
    }

    return NULL;
}

// Benchmark: load `enabled` and branch on "!= 0" while a helper thread
// (thread_handler in CACHE_MISS mode) keeps storing the starting value 1 to
// `enabled`.
void BM_cache_miss_load_bnez(benchmark::State& state) {
    pthread_t th;

    enabled = 1;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_MISS;

    // If the helper cannot be started, the wait on thread_start below would
    // never finish; report the failure and skip the benchmark instead.
    if (pthread_create (&th, NULL, thread_handler, (void *)&tdata) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper is actually running before timing anything.
    while (!thread_start);

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        if (enabled != 0)
            benchmark_ClobberMemory();
    }

    // Stop the helper and wait for it to finish.
    thread_exit = 1;
    pthread_join (th, NULL);
}
BENCHMARK(BM_cache_miss_load_bnez);
//BENCHMARK(BM_cache_miss_load_bnez)->ThreadRange(1,3);
//BENCHMARK(BM_cache_miss_load_bnez)->ThreadPerCpu();

// Benchmark: load `enabled` and branch on "== 0" while a helper thread
// (thread_handler in CACHE_MISS mode) keeps storing the starting value 0 to
// `enabled`.
void BM_cache_miss_load_beqz(benchmark::State& state) {
    pthread_t th;

    enabled = 0;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_MISS;

    // If the helper cannot be started, the wait on thread_start below would
    // never finish; report the failure and skip the benchmark instead.
    if (pthread_create (&th, NULL, thread_handler, (void *)&tdata) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper is actually running before timing anything.
    while (!thread_start);

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        if (enabled == 0)
            benchmark_ClobberMemory();
    }

    // Stop the helper and wait for it to finish.
    thread_exit = 1;
    pthread_join (th, NULL);
}
BENCHMARK(BM_cache_miss_load_beqz);
//BENCHMARK(BM_cache_miss_load_beqz)->ThreadRange(1,3);
//BENCHMARK(BM_cache_miss_load_beqz)->ThreadPerCpu();

// Benchmark: load `enabled` (preset to 1) and branch on "!= 0" while a helper
// thread (thread_handler in BRANCH_MISS mode) keeps storing 1 - 1 = 0 to
// `enabled`.
void BM_branch_miss_load_bnez(benchmark::State& state) {
    pthread_t th;

    enabled = 1;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = BRANCH_MISS;

    // If the helper cannot be started, the wait on thread_start below would
    // never finish; report the failure and skip the benchmark instead.
    if (pthread_create (&th, NULL, thread_handler, (void *)&tdata) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper is actually running before timing anything.
    while (!thread_start);

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        if (enabled != 0)
            benchmark_ClobberMemory();
    }

    // Stop the helper and wait for it to finish.
    thread_exit = 1;
    pthread_join (th, NULL);
}
BENCHMARK(BM_branch_miss_load_bnez);
//BENCHMARK(BM_branch_miss_load_bnez)->ThreadRange(1,3);
//BENCHMARK(BM_branch_miss_load_bnez)->ThreadPerCpu();

// Benchmark: load `enabled` (preset to 0) and branch on "== 0" while a helper
// thread (thread_handler in BRANCH_MISS mode) keeps storing 1 - 0 = 1 to
// `enabled`.
void BM_branch_miss_load_beqz(benchmark::State& state) {
    pthread_t th;

    enabled = 0;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = BRANCH_MISS;

    // If the helper cannot be started, the wait on thread_start below would
    // never finish; report the failure and skip the benchmark instead.
    if (pthread_create (&th, NULL, thread_handler, (void *)&tdata) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper is actually running before timing anything.
    while (!thread_start);

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        if (enabled == 0)
            benchmark_ClobberMemory();
    }

    // Stop the helper and wait for it to finish.
    thread_exit = 1;
    pthread_join (th, NULL);
}
BENCHMARK(BM_branch_miss_load_beqz);
//BENCHMARK(BM_branch_miss_load_beqz)->ThreadRange(1,3);
//BENCHMARK(BM_branch_miss_load_beqz)->ThreadPerCpu();

// Benchmark: load `enabled` (preset to 1) and branch on "!= 0" while a helper
// thread (thread_handler in CACHE_BRANCH_MISS mode) keeps flipping `enabled`
// between 0 and 1.
void BM_cache_branch_miss_load_bnez(benchmark::State& state) {
    pthread_t th;

    enabled = 1;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_BRANCH_MISS;

    // If the helper cannot be started, the wait on thread_start below would
    // never finish; report the failure and skip the benchmark instead.
    if (pthread_create (&th, NULL, thread_handler, (void *)&tdata) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper is actually running before timing anything.
    while (!thread_start);

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        if (enabled != 0)
            benchmark_ClobberMemory();
    }

    // Stop the helper and wait for it to finish.
    thread_exit = 1;
    pthread_join (th, NULL);
}
BENCHMARK(BM_cache_branch_miss_load_bnez);
//BENCHMARK(BM_cache_branch_miss_load_bnez)->ThreadRange(1,3);
//BENCHMARK(BM_cache_branch_miss_load_bnez)->ThreadPerCpu();

// Benchmark: load `enabled` (preset to 0) and branch on "== 0" while a helper
// thread (thread_handler in CACHE_BRANCH_MISS mode) keeps flipping `enabled`
// between 0 and 1.
void BM_cache_branch_miss_load_beqz(benchmark::State& state) {
    pthread_t th;

    enabled = 0;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_BRANCH_MISS;

    // If the helper cannot be started, the wait on thread_start below would
    // never finish; report the failure and skip the benchmark instead.
    if (pthread_create (&th, NULL, thread_handler, (void *)&tdata) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper is actually running before timing anything.
    while (!thread_start);

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        if (enabled == 0)
            benchmark_ClobberMemory();
    }

    // Stop the helper and wait for it to finish.
    thread_exit = 1;
    pthread_join (th, NULL);
}
BENCHMARK(BM_cache_branch_miss_load_beqz);
//BENCHMARK(BM_cache_branch_miss_load_beqz)->ThreadRange(1,3);
//BENCHMARK(BM_cache_branch_miss_load_beqz)->ThreadPerCpu();

#endif // BRANCH_EXT_BENCHMARK

// Ref: benchmark/test/donotoptimize_assembly_test.cc
// Adds `input` to a local that starts at 0 and returns the result.
// The addition is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_inc_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 0;
    benchmark::DoNotOptimize(x += input);
    return x;
}

// Benchmark driver for test_inc_integer(): every timed iteration passes it
// the current state.iterations() value, converted to the helper's int
// parameter.
void BM_inc(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_inc_integer(iter_count));
    }
}
BENCHMARK(BM_inc);
// BENCHMARK(BM_inc)->ThreadPerCpu();

// Subtracts `input` from a local that starts at 100 and returns the result.
// The subtraction is wrapped in benchmark::DoNotOptimize() so it is not
// elided.
int test_dec_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 100;
    benchmark::DoNotOptimize(x -= input);
    return x;
}

// Benchmark driver for test_dec_integer(): every timed iteration passes it
// the current state.iterations() value, converted to the helper's int
// parameter.
void BM_dec(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_dec_integer(iter_count));
    }
}
BENCHMARK(BM_dec);
// BENCHMARK(BM_dec)->ThreadPerCpu();

// Returns `input` multiplied by 3 (wrapping modulo 2^N on overflow).
// The multiplication is wrapped in benchmark::DoNotOptimize() so it is not
// elided.
int test_mul_by_three(int input) {
    // No `register`: the storage class was removed in C++17.
    int multiplier = 3;
    int x = 0;
    // BM_mul feeds in an iteration counter, which can exceed INT_MAX / 3.
    // Multiply as unsigned so that case wraps instead of being signed-overflow
    // undefined behaviour; the result is unchanged whenever the product fits.
    benchmark::DoNotOptimize(
        x = static_cast<int>(static_cast<unsigned>(input) *
                             static_cast<unsigned>(multiplier)));
    return x;
}

// Benchmark driver for test_mul_by_three(): every timed iteration passes it
// the current state.iterations() value, converted to the helper's int
// parameter.
void BM_mul(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_mul_by_three(iter_count));
    }
}
BENCHMARK(BM_mul);
// BENCHMARK(BM_mul)->ThreadPerCpu();

// Returns `input` divided by 3 using integer division.
// The division is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_div_by_three(int input) {
    // No `register`: the storage class was removed in C++17.
    int divisor = 3;
    int x = 0;
    benchmark::DoNotOptimize(x = input / divisor);
    return x;
}

// Benchmark driver for test_div_by_three(): every timed iteration passes it
// the current state.iterations() value, converted to the helper's int
// parameter.
void BM_div(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_div_by_three(iter_count));
    }
}
BENCHMARK(BM_div);
// BENCHMARK(BM_div)->ThreadPerCpu();

// float support
// Ref: benchmark/test/donotoptimize_assembly_test.cc
// Adds the integer `input` to a float that starts at 0.1111 and returns the
// sum. The addition is wrapped in benchmark::DoNotOptimize() so it is not
// elided.
float test_float_inc_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    float x = 0.1111;
    benchmark::DoNotOptimize(x += input);
    return x;
}

// Benchmark driver for test_float_inc_integer(): every timed iteration passes
// it the current state.iterations() value, converted to the helper's int
// parameter.
void BM_float_inc(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_float_inc_integer(iter_count));
    }
}
BENCHMARK(BM_float_inc);
// BENCHMARK(BM_float_inc)->ThreadPerCpu();

// Subtracts the integer `input` from a float that starts at 100.1111 and
// returns the difference. The subtraction is wrapped in
// benchmark::DoNotOptimize() so it is not elided.
float test_float_dec_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    float x = 100.1111;
    benchmark::DoNotOptimize(x -= input);
    return x;
}

// Benchmark driver for test_float_dec_integer(): every timed iteration passes
// it the current state.iterations() value, converted to the helper's int
// parameter.
void BM_float_dec(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_float_dec_integer(iter_count));
    }
}
BENCHMARK(BM_float_dec);
// BENCHMARK(BM_float_dec)->ThreadPerCpu();

// Returns the integer `input` multiplied by the float constant 3.1111.
// The multiplication is wrapped in benchmark::DoNotOptimize() so it is not
// elided.
float test_float_mul_by_three(int input) {
    // No `register`: the storage class was removed in C++17.
    float multiplier = 3.1111;
    float x = 0.1111;
    benchmark::DoNotOptimize(x = input * multiplier);
    return x;
}

// Benchmark driver for test_float_mul_by_three(): every timed iteration
// passes it the current state.iterations() value, converted to the helper's
// int parameter.
void BM_float_mul(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_float_mul_by_three(iter_count));
    }
}
BENCHMARK(BM_float_mul);
// BENCHMARK(BM_float_mul)->ThreadPerCpu();

// Returns the integer `input` divided by the float constant 3.1111.
// The division is wrapped in benchmark::DoNotOptimize() so it is not elided.
float test_float_div_by_three(int input) {
    // No `register`: the storage class was removed in C++17.
    float divisor = 3.1111;
    float x = 0.1111;
    benchmark::DoNotOptimize(x = input / divisor);
    return x;
}

// Benchmark driver for test_float_div_by_three(): every timed iteration
// passes it the current state.iterations() value, converted to the helper's
// int parameter.
void BM_float_div(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_float_div_by_three(iter_count));
    }
}
BENCHMARK(BM_float_div);
// BENCHMARK(BM_float_div)->ThreadPerCpu();

// logic arithmetic support
// Logical (not bitwise) AND of the constant 3 with `input`: returns 1 when
// `input` is non-zero and 0 otherwise. The operation is wrapped in
// benchmark::DoNotOptimize() so it is not elided.
int test_and_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 3;
    benchmark::DoNotOptimize(x = x && input);
    return x;
}

// Benchmark driver for test_and_integer(): every timed iteration passes it
// the current state.iterations() value, converted to the helper's int
// parameter.
void BM_and(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_and_integer(iter_count));
    }
}
BENCHMARK(BM_and);
// BENCHMARK(BM_and)->ThreadPerCpu();

// Logical (not bitwise) OR of the constant 0 with `input`: returns 1 when
// `input` is non-zero and 0 otherwise. The operation is wrapped in
// benchmark::DoNotOptimize() so it is not elided.
int test_or_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 0;
    benchmark::DoNotOptimize(x = x || input);
    return x;
}

// Benchmark driver for test_or_integer(): every timed iteration passes it
// the current state.iterations() value, converted to the helper's int
// parameter.
void BM_or(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_or_integer(iter_count));
    }
}
BENCHMARK(BM_or);
// BENCHMARK(BM_or)->ThreadPerCpu();

// Logical NOT of `input`: returns 1 when `input` is zero and 0 otherwise.
// The operation is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_not_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 0;
    benchmark::DoNotOptimize(x = !input);
    return x;
}

// Benchmark driver for test_not_integer(): every timed iteration passes it
// the current state.iterations() value, converted to the helper's int
// parameter.
void BM_not(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_not_integer(iter_count));
    }
}
BENCHMARK(BM_not);
// BENCHMARK(BM_not)->ThreadPerCpu();

// logic bits arithmetic support
// Returns the bitwise AND of the constant 3004240 and `input`.
// The operation is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_bits_and_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 3004240;
    benchmark::DoNotOptimize(x = x & input);
    return x;
}

// Benchmark driver for test_bits_and_integer(): every timed iteration passes
// it the current state.iterations() value, converted to the helper's int
// parameter.
void BM_bits_and(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_bits_and_integer(iter_count));
    }
}
BENCHMARK(BM_bits_and);
// BENCHMARK(BM_bits_and)->ThreadPerCpu();

// Returns the bitwise OR of the constant 3121321 and `input`.
// The operation is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_bits_or_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 3121321;
    benchmark::DoNotOptimize(x = x | input);
    return x;
}

// Benchmark driver for test_bits_or_integer(): every timed iteration passes
// it the current state.iterations() value, converted to the helper's int
// parameter.
void BM_bits_or(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_bits_or_integer(iter_count));
    }
}
BENCHMARK(BM_bits_or);
// BENCHMARK(BM_bits_or)->ThreadPerCpu();

// Returns the bitwise XOR of the constant 3121321 and `input`.
// NOTE(review): despite the "nor" in the name, the operator used is `^`
// (exclusive OR), not NOR -- confirm which operation was meant to be timed.
// The operation is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_bits_nor_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 3121321;
    benchmark::DoNotOptimize(x = x ^ input);
    return x;
}

// Benchmark driver for test_bits_nor_integer(): every timed iteration passes
// it the current state.iterations() value, converted to the helper's int
// parameter.
void BM_bits_nor(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_bits_nor_integer(iter_count));
    }
}
BENCHMARK(BM_bits_nor);
// BENCHMARK(BM_bits_nor)->ThreadPerCpu();

// Returns the bitwise complement of `input`.
// The operation is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_bits_not_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int x = 0;
    benchmark::DoNotOptimize(x = ~input);
    return x;
}

// Benchmark driver for test_bits_not_integer(): every timed iteration passes
// it the current state.iterations() value, converted to the helper's int
// parameter.
void BM_bits_not(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_bits_not_integer(iter_count));
    }
}
BENCHMARK(BM_bits_not);
// BENCHMARK(BM_bits_not)->ThreadPerCpu();

// Returns `input` shifted right by 3 bits.
// The shift is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_bits_rshift_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int shift = 3;
    int x = 0;
    benchmark::DoNotOptimize(x = input >> shift);
    return x;
}

// Benchmark driver for test_bits_rshift_integer(): every timed iteration
// passes it the current state.iterations() value, converted to the helper's
// int parameter.
void BM_bits_rshift(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_bits_rshift_integer(iter_count));
    }
}
BENCHMARK(BM_bits_rshift);
// BENCHMARK(BM_bits_rshift)->ThreadPerCpu();

// Returns `input` shifted left by 3 bits (bits shifted out are discarded).
// The shift is wrapped in benchmark::DoNotOptimize() so it is not elided.
int test_bits_lshift_integer(int input) {
    // No `register`: the storage class was removed in C++17.
    int shift = 3;
    int x = 0;
    // BM_bits_lshift feeds in an iteration counter. Before C++20, shifting a
    // signed value so that the result no longer fits is undefined behaviour,
    // so shift as unsigned; the result is unchanged whenever it fits in int.
    benchmark::DoNotOptimize(
        x = static_cast<int>(static_cast<unsigned>(input) << shift));
    return x;
}

// Benchmark driver for test_bits_lshift_integer(): every timed iteration
// passes it the current state.iterations() value, converted to the helper's
// int parameter.
void BM_bits_lshift(benchmark::State& state) {
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        const auto iter_count = state.iterations();
        static_cast<void>(test_bits_lshift_integer(iter_count));
    }
}
BENCHMARK(BM_bits_lshift);
// BENCHMARK(BM_bits_lshift)->ThreadPerCpu();

// test loops
// Runs benchmark_ClobberMemory() `input` times using a counted `for` loop
// (1..input inclusive); does nothing when `input` is below 1.
void test_for_loop(int input) {
    for (int count = 1; count <= input; ++count)
        benchmark_ClobberMemory();
}

// Benchmark driver for test_for_loop(): each timed iteration runs the helper
// with a fixed trip count of 10.
void BM_for_loop(benchmark::State& state) {
    // No `register`: the storage class was removed in C++17.
    int n = 10;
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        test_for_loop(n);
    }
}
BENCHMARK(BM_for_loop);
// BENCHMARK(BM_for_loop)->ThreadPerCpu();

// Runs benchmark_ClobberMemory() `input` times using a `while` loop whose
// counter starts at 1; does nothing when `input` is below 1.
void test_while_loop(int input) {
    int count = 1;

    while (!(input < count)) {
        ++count;
        benchmark_ClobberMemory();
    }
}

// Benchmark driver for test_while_loop(): each timed iteration runs the
// helper with a fixed trip count of 10.
void BM_while_loop(benchmark::State& state) {
    // No `register`: the storage class was removed in C++17.
    int n = 10;
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        test_while_loop(n);
    }
}
BENCHMARK(BM_while_loop);
// BENCHMARK(BM_while_loop)->ThreadPerCpu();

// Runs benchmark_ClobberMemory() using a `do ... while` loop: the body always
// executes at least once, then repeats until the counter reaches `input`.
void test_do_while_loop(int input) {
    int count = 0;

    do {
        ++count;
        benchmark_ClobberMemory();
    } while (input > count);
}

// Benchmark driver for test_do_while_loop(): each timed iteration runs the
// helper with a fixed trip count of 10.
void BM_do_while_loop(benchmark::State& state) {
    // No `register`: the storage class was removed in C++17.
    int n = 10;
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        test_do_while_loop(n);
    }
}
BENCHMARK(BM_do_while_loop);
// BENCHMARK(BM_do_while_loop)->ThreadPerCpu();

// Ref: benchmark/test/complexity_test.cc
// In-place ascending bubble sort of the first `len` elements of `arr`.
// Each pass moves the largest remaining element to the end of the unsorted
// prefix. Arrays of length 0 or 1 are left untouched.
void BENCHMARK_NOINLINE test_bubble_sort(int arr[], int len) {
    const int last = len - 1;

    for (int pass = 0; pass < last; ++pass) {
        // Elements at index >= unsorted_end are already in final position.
        const int unsorted_end = last - pass;
        for (int k = 0; k < unsorted_end; ++k) {
            if (arr[k + 1] < arr[k]) {
                const int held = arr[k];
                arr[k] = arr[k + 1];
                arr[k + 1] = held;
            }
        }
    }
}

// Benchmark driver for test_bubble_sort() on a fixed 10-element array.
// NOTE: the array lives outside the timed loop and is sorted in place, so
// only the first iteration sorts unsorted data; all later iterations run on
// an already-sorted array.
static void BM_bubble_sort(benchmark::State &state) {
    int arr[] = { 53, 42, 2, 25, 99, 52, 77, 30, 37, 10 };
    // Cast the element count as a whole; the old `(int) sizeof(arr) / ...`
    // applied the cast to sizeof(arr) only.
    int len = static_cast<int>(sizeof(arr) / sizeof(*arr));

    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        test_bubble_sort(arr, len);
    }

#if 0
    std::stringstream ss;
    int i;
    for (i = 0; i < len - 1; i++) {
      ss << arr[i];
      ss << ' ';
    }
    state.SetLabel(ss.str());
#endif
}

BENCHMARK(BM_bubble_sort);
// BENCHMARK(BM_bubble_sort)->ThreadPerCpu();

// Builds a vector of `size` pseudo-random integers in the range [0, 100),
// drawn from std::rand(). A zero or negative `size` yields an empty vector.
std::vector<int> ConstructRandomVector(int64_t size) {
  std::vector<int> v;
  // A negative size would turn into a huge size_t and make reserve() throw.
  if (size <= 0)
    return v;

  v.reserve(static_cast<std::size_t>(size));
  // Index with the parameter's own type so large sizes cannot overflow it.
  for (int64_t i = 0; i < size; ++i)
    v.push_back(std::rand() % 100);

  return v;
}

// Benchmark driver for std::sort() on a 10-element random vector.
// The vector is created once, outside the timed loop, and sorted in place on
// every iteration.
static void BM_std_sort(benchmark::State &state) {
  std::vector<int> data = ConstructRandomVector(10);
  for (auto _ : state) {
      benchmark_DoNotOptimize_Iterations();
      std::sort(data.begin(), data.end());
  }

#if 0
    std::stringstream ss;
    int i;
    for (i = 0; i < 9; i++) {
      ss << data[i];
      ss << ' ';
    }
    state.SetLabel(ss.str());
#endif

}

BENCHMARK(BM_std_sort);
// BENCHMARK(BM_std_sort)->ThreadPerCpu();

// Ref: benchmark/test/benchmark_test.cc
// Pi/4 = 1 - 1/3 + 1/5 - 1/7 + 1/9 - ...
// Approximates pi by summing `depth` terms of an alternating odd-reciprocal
// series. Term 0 evaluates to (-1)/(-1) = 1; that extra 1 is subtracted again
// before scaling by 4. A non-positive `depth` sums nothing and returns -4.
double test_calculate_pi(int depth) {
    double sum = 0.0;
    int term = 0;

    while (term < depth) {
        // Odd terms are positive, even terms negative.
        const int sign = (term % 2 != 0) ? 1 : -1;
        const int odd = 2 * term - 1;
        sum += static_cast<double>(sign) / static_cast<double>(odd);
        ++term;
    }

    return 4 * (sum - 1.0);
}

// Benchmark driver for test_calculate_pi() with a fixed depth of 128 terms.
// The result is passed through benchmark::DoNotOptimize() so the call is not
// elided.
static void BM_calculate_pi(benchmark::State& state) {
    // No `register`: the storage class was removed in C++17.
    double pi = 0.0;
    static const int depth = 128;
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        // `depth` is already an int, so no cast is needed.
        benchmark::DoNotOptimize(pi = test_calculate_pi(depth));
    }

#if 0
    std::stringstream ss;
    ss << pi;
    state.SetLabel(ss.str());
#endif
}

BENCHMARK(BM_calculate_pi);
// BENCHMARK(BM_calculate_pi)->ThreadPerCpu();

// Ref: benchmark/test/benchmark_test.cc
// Recursive factorial of `n`. Any n <= 1 returns 1, so zero and negative
// arguments terminate instead of recursing without bound. Each level's
// result is passed through benchmark::DoNotOptimize().
int BENCHMARK_NOINLINE test_factorial(int n) {
    // No `register`: the storage class was removed in C++17.
    int ret = 0;
    benchmark::DoNotOptimize(ret = (n <= 1) ? 1 : n * test_factorial(n - 1));
    return ret;
}

// Benchmark driver for test_factorial() with a fixed argument of 8.
// NOTE: despite its name, `fac_42` holds 8!, not 42!.
static void BM_factorial(benchmark::State& state) {
    // No `register`: the storage class was removed in C++17.
    int fac_42 = 0;
    int n = 8;
    for (auto _ : state) {
        benchmark_DoNotOptimize_Iterations();
        fac_42 = test_factorial(n);
    }
    // Disabled: would publish the result as the benchmark label.
#if 0
    std::stringstream ss;
    ss << fac_42;
    state.SetLabel(ss.str());
#endif
}
BENCHMARK(BM_factorial);
//BENCHMARK(BM_factorial)->ThreadPerCpu();

// Google Benchmark macro that supplies the program's main(), which runs the
// benchmarks registered with BENCHMARK() above.
BENCHMARK_MAIN();
